Part 1: Reading in the Data

First, we need to read in the data and combine the training and test sets into a single data frame.

# Read the pre-split Dreaddit train/test CSVs and combine them into one data frame.
train_data <- read_csv("reddit_stress_data/dreaddit-train.csv", show_col_types = FALSE)
test_data <- read_csv("reddit_stress_data/dreaddit-test.csv", show_col_types = FALSE)
# bind_rows() row-binds two data frames; add_row() is meant for appending
# individual name-value rows and is not designed to take a whole data frame.
reddit_stress_data <- bind_rows(train_data, test_data)

Now we need to find the word distributions. We’ll start by unnesting the tokens, computing this over the full dataset.

# Tokenize every post into one row per word, strip punctuation, collapse
# standalone digit runs into a single "%d%" placeholder token, and drop
# stop words.
words_tokenized <- reddit_stress_data %>%
  select(c("id", "text", "label", "subreddit")) %>%
  unnest_tokens(word, text) %>%
  mutate(word = gsub('[[:punct:]]+','', word)) %>%
  mutate(word = gsub('\\<[[:digit:]]+\\>', '%d%', word)) %>%
  # Spell out the join key so anti_join() does not emit a "Joining, by" message.
  anti_join(stop_words, by = "word")
head(words_tokenized)
## # A tibble: 6 x 4
##      id label subreddit word        
##   <dbl> <dbl> <chr>     <chr>       
## 1 33181     1 ptsd      suggeted    
## 2 33181     1 ptsd      rest        
## 3 33181     1 ptsd      trigger     
## 4 33181     1 ptsd      ahead       
## 5 33181     1 ptsd      youire      
## 6 33181     1 ptsd      hypocondriac

Part 2: Understanding the General Structure of the Data

Getting the Labels Distribution

# Count posts per label (stress indicator; presumably 1 = stressed in the
# Dreaddit dataset — confirm against the dataset documentation).
# NOTE(review): count() after group_by() leaves the result grouped by label;
# harmless here, but ungroup() (or count(label) directly) would be tidier.
label_counts <- reddit_stress_data %>%
  group_by(label) %>%
  count()
# Interactive bar chart of the label distribution.
plot_ly(label_counts, x = ~label, y = ~n, type = "bar")

Table of Labels

# Print the label counts computed above (still grouped by label).
label_counts
## # A tibble: 2 x 2
## # Groups:   label [2]
##   label     n
##   <dbl> <int>
## 1     0  1696
## 2     1  1857

Subreddit Distribution

# Count posts per subreddit. As with the label counts, count() after
# group_by() leaves the result grouped, as the printout shows.
subreddit_counts <- reddit_stress_data %>%
  group_by(subreddit) %>%
  count()
subreddit_counts
## # A tibble: 10 x 2
## # Groups:   subreddit [10]
##    subreddit            n
##    <chr>            <int>
##  1 almosthomeless      99
##  2 anxiety            650
##  3 assistance         355
##  4 domesticviolence   388
##  5 food_pantry         43
##  6 homeless           220
##  7 ptsd               711
##  8 relationships      694
##  9 stress              78
## 10 survivorsofabuse   315
# Interactive bar chart of posts per subreddit.
plot_ly(subreddit_counts, x = ~subreddit, y = ~n, type = "bar")

Labels By Subreddit

# Stacked horizontal bars: one bar per subreddit, segments colored by label.
stacked_by_subreddit <- reddit_stress_data %>%
  ggplot(aes(y = subreddit)) +
  geom_bar(aes(fill = as.factor(label)), position = "stack")
stacked_by_subreddit

Part 3: Visualizing the Top 15 Most Common Words Among the Data

Now let’s see the most common words among the data (overall).

# Return the `num` most frequent words in a tokenized data frame.
#
# df:  a data frame with one row per token and a `word` column.
# num: how many of the top words to keep.
# Returns a data frame of (word, n) sorted by descending frequency.
GetTopNMostCommonWords <- function(df, num) {
  # count(sort = TRUE) tallies and orders by descending n in one step,
  # replacing the original count() %>% arrange(desc(n)) pair; the last
  # expression is returned, so no explicit return() is needed.
  df %>%
    count(word, sort = TRUE) %>%
    head(num)
}
num <- 15
# NOTE(review): the name says "top_10" but num is 15 — consider renaming.
top_10_full_data <- GetTopNMostCommonWords(words_tokenized, num)

Now I will plot the top 15 most common words in the dataset.

ggplot(top_10_full_data, aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 10 Words from the Full Dataset", x = "Word", y = "Frequency")

Now let’s see how this varies among label: stressed or non-stressed.

# In the Dreaddit data, label 1 marks a stressed post and label 0 a
# non-stressed one (see the label-1 ptsd examples above); the original
# assignments were swapped.
stressed_data <- filter(words_tokenized, label == 1)
non_stressed_data <- filter(words_tokenized, label == 0)

Now let’s plot them

ggplot(GetTopNMostCommonWords(non_stressed_data, num), aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 10 Words from the Non-Stressed Dataset", x = "Word", y = "Frequency")

Now let’s see the difference among stressed data.

ggplot(GetTopNMostCommonWords(stressed_data, num), aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 10 Words from the Stressed Dataset", x = "Word", y = "Frequency")

# Visualizing the Distribution of Words By Subreddit

# Word frequencies per subreddit: repeat the tokenizing/cleaning pipeline
# used for words_tokenized, then count each word within its subreddit.
words_tokenized_by_subreddit_counts <- reddit_stress_data %>%
  select(c("id", "text", "label", "subreddit")) %>%
  unnest_tokens(word, text) %>%
  # Strip punctuation, then replace standalone digit runs with "%d%".
  mutate(word = gsub('[[:punct:]]+','', word)) %>%
  mutate(word = gsub('\\<[[:digit:]]+\\>', '%d%', word)) %>%
  anti_join(stop_words) %>%
  group_by(subreddit) %>%
  count(word) %>%
  arrange(subreddit, desc(n))
## Joining, by = "word"
head(words_tokenized_by_subreddit_counts)
## # A tibble: 6 x 3
## # Groups:   subreddit [1]
##   subreddit      word      n
##   <chr>          <chr> <int>
## 1 almosthomeless im       76
## 2 almosthomeless %d%      69
## 3 almosthomeless job      32
## 4 almosthomeless dont     31
## 5 almosthomeless time     28
## 6 almosthomeless ive      25

Visualizing the Distribution of Words By Label

# Word frequencies per label (0/1) using the same cleaning pipeline.
# NOTE(review): the variable name says "by_subreddit" but this chunk groups
# by label and overwrites the per-subreddit counts computed just above —
# consider a separate ..._by_label_counts variable.
words_tokenized_by_subreddit_counts <- reddit_stress_data %>%
  select(c("id", "text", "label", "subreddit")) %>%
  unnest_tokens(word, text) %>%
  # Strip punctuation, then replace standalone digit runs with "%d%".
  mutate(word = gsub('[[:punct:]]+','', word)) %>%
  mutate(word = gsub('\\<[[:digit:]]+\\>', '%d%', word)) %>%
  anti_join(stop_words) %>%
  group_by(label) %>%
  count(word) %>%
  arrange(label, desc(n))
## Joining, by = "word"
head(words_tokenized_by_subreddit_counts)
## # A tibble: 6 x 3
## # Groups:   label [1]
##   label word       n
##   <dbl> <chr>  <int>
## 1     0 %d%     1200
## 2     0 im       659
## 3     0 time     450
## 4     0 dont     391
## 5     0 ive      331
## 6     0 people   316
# Faceted bar charts of the top-10 words within each label.
words_tokenized_by_subreddit_counts %>%
  # slice_max() supersedes the deprecated-in-favor top_n(); with_ties = TRUE
  # matches top_n()'s behavior of keeping all rows tied at the cutoff, and
  # it runs per group because the input is still grouped by label.
  slice_max(n, n = 10, with_ties = TRUE) %>%
  ungroup() %>%
  mutate(label = as.factor(label)) %>%
  arrange(label, n) %>%
  # (The original also built an unused topic_r row number; dropped.)
  ggplot(aes(word, n, fill = label)) +
  geom_col() +
  facet_wrap(~ label, scales = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Now let’s plot them

# Word frequencies per subreddit again.
# NOTE(review): this recomputes exactly the per-subreddit counts already
# built in an earlier chunk; reusing that result would avoid the duplicate
# tokenization work.
words_tokenized_by_subreddit_counts <- reddit_stress_data %>%
  select(c("id", "text", "label", "subreddit")) %>%
  unnest_tokens(word, text) %>%
  # Strip punctuation, then replace standalone digit runs with "%d%".
  mutate(word = gsub('[[:punct:]]+','', word)) %>%
  mutate(word = gsub('\\<[[:digit:]]+\\>', '%d%', word)) %>%
  anti_join(stop_words) %>%
  group_by(subreddit) %>%
  count(word) %>%
  arrange(subreddit, desc(n))
## Joining, by = "word"
head(words_tokenized_by_subreddit_counts)
## # A tibble: 6 x 3
## # Groups:   subreddit [1]
##   subreddit      word      n
##   <chr>          <chr> <int>
## 1 almosthomeless im       76
## 2 almosthomeless %d%      69
## 3 almosthomeless job      32
## 4 almosthomeless dont     31
## 5 almosthomeless time     28
## 6 almosthomeless ive      25
# Faceted bar charts of the top-10 words within each subreddit.
words_tokenized_by_subreddit_counts %>%
  # slice_max() supersedes top_n(); with_ties = TRUE matches top_n()'s
  # tie-keeping behavior and runs per group (data is grouped by subreddit).
  slice_max(n, n = 10, with_ties = TRUE) %>%
  ungroup() %>%
  arrange(subreddit, n) %>%
  # (The original also built an unused topic_r row number; dropped.)
  ggplot(aes(word, n, fill = subreddit)) +
  geom_col() +
  facet_wrap(~ subreddit, scales = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Now looking at a subset.

# Per-subreddit word counts restricted to three subreddits of interest.
words_tokenized_by_subreddit_counts <- reddit_stress_data %>%
  select(c("id", "text", "label", "subreddit")) %>%
  # %in% replaces the repeated `==` ... `|` chain.
  filter(subreddit %in% c("almosthomeless", "anxiety", "survivorsofabuse")) %>%
  unnest_tokens(word, text) %>%
  mutate(word = gsub('[[:punct:]]+','', word)) %>%
  mutate(word = gsub('\\<[[:digit:]]+\\>', '%d%', word)) %>%
  # Explicit join key silences the "Joining, by" message.
  anti_join(stop_words, by = "word") %>%
  group_by(subreddit) %>%
  count(word) %>%
  arrange(subreddit, desc(n))
head(words_tokenized_by_subreddit_counts)
## # A tibble: 6 x 3
## # Groups:   subreddit [1]
##   subreddit      word      n
##   <chr>          <chr> <int>
## 1 almosthomeless im       76
## 2 almosthomeless %d%      69
## 3 almosthomeless job      32
## 4 almosthomeless dont     31
## 5 almosthomeless time     28
## 6 almosthomeless ive      25
# Faceted bar charts of the top-10 words within each selected subreddit.
words_tokenized_by_subreddit_counts %>%
  # slice_max() supersedes top_n(); with_ties = TRUE matches top_n()'s
  # tie-keeping behavior and runs per group (data is grouped by subreddit).
  slice_max(n, n = 10, with_ties = TRUE) %>%
  ungroup() %>%
  arrange(subreddit, n) %>%
  # (The original also built an unused topic_r row number; dropped.)
  ggplot(aes(word, n, fill = subreddit)) +
  geom_col() +
  facet_wrap(~ subreddit, scales = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Part 4: Visualizing the Distribution of Sentiment

Overall

# Boxplot of per-post sentiment. geom_boxplot() has no `bins` argument
# (that belongs to geom_histogram()), so the original emitted an
# "Ignoring unknown parameters: bins" warning; the argument is dropped.
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment")

# Sentiment value treated as neutral, marked with a red reference line.
mx <- 0
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_histogram(fill = "steelblue", bins = 50) +
  labs(title = "Distribution of Sentiment") +
  geom_vline(xintercept = mx, col = "red", lwd = 1) +
  annotate("text", x = 0.1, y = 400, label = "Neutral")

## By Label

# Sentiment distributions split by stress label: boxplots, then histograms
# with the neutral-sentiment reference line.
reddit_stress_data %>%
  ggplot(aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment") +
  facet_wrap(~ label)

reddit_stress_data %>%
  ggplot(aes(x = sentiment)) +
  geom_histogram(fill = "steelblue", bins = 50) +
  labs(title = "Distribution of Sentiment") +
  facet_wrap(~ label) +
  geom_vline(xintercept = mx, col = "red", lwd = 1) +
  annotate("text", x = 0.2, y = 400, label = "Neutral")

## By Subreddit

ggplot(reddit_stress_data, aes(x = sentiment)) + geom_boxplot(fill = "steelblue") + labs(title = "Distribution of Sentiment") + facet_wrap(~ subreddit)

By Label and Subreddit

ggplot(reddit_stress_data, aes(x = sentiment)) + geom_boxplot(fill = "steelblue") + labs(title = "Distribution of Sentiment") + facet_grid(subreddit ~ label, switch ="y") + theme(strip.text.y.left = element_text(angle = 0))

Now looking at a selection of these:

# Compare sentiment for two subreddits, faceted subreddit-by-label.
reddit_stress_data %>%
  # %in% replaces the repeated `==` ... `|` chain.
  filter(subreddit %in% c("assistance", "stress")) %>%
  ggplot(aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment") +
  facet_grid(subreddit ~ label, switch ="y") +
  theme(strip.text.y.left = element_text(angle = 0))